<!DOCTYPE HTML>
<html lang="zh-Hans">
    <!-- Start book Python爬虫课程讲义 -->
    <head>
        <!-- head:start -->
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
        <title>入门案例 | Python爬虫课程讲义</title>
        <meta name="description" content="Scrapy 入门案例：创建 Scrapy 项目、定义 Item、编写 Spider 提取结构化数据，并用 Item Pipelines 存储提取到的 Item。">
        <meta name="generator" content="GitBook 2.6.7">
        <meta name="author" content="BigCat">
        
        <meta name="HandheldFriendly" content="true"/>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="apple-mobile-web-app-capable" content="yes">
        <meta name="apple-mobile-web-app-status-bar-style" content="black">
        <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
        <link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">
        
    <link rel="stylesheet" href="../../gitbook/style.css">
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-tbfed-pagefooter/footer.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-splitter/splitter.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-highlight/website.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-fontsettings/website.css">
        
    
    

        
    
    
    <link rel="next" href="../../file/part04/4.3.html" />
    
    
    <link rel="prev" href="../../file/part04/4.1.html" />
    

        <!-- head:end -->
    </head>
    <body>
        <!-- body:start -->
        
    <div class="book"
        data-level="4.2"
        data-chapter-title="入门案例"
        data-filepath="file/part04/4.2.md"
        data-basepath="../.."
        data-revision="Thu Feb 09 2017 09:48:59 GMT+0800 (CST)"
        data-innerlanguage="">
    

<div class="book-summary">
    <nav role="navigation">
        <ul class="summary">
            
            
            
            

            

            
    
        <li class="chapter " data-level="0" data-path="index.html">
            
                
                    <a href="../../index.html">
                
                        <i class="fa fa-check"></i>
                        
                        传智播客Python学院爬虫课程
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1" data-path="file/part01/1.html">
            
                
                    <a href="../../file/part01/1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.</b>
                        
                        爬虫原理与数据抓取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="1.1" data-path="file/part01/1.1.html">
            
                
                    <a href="../../file/part01/1.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.1.</b>
                        
                        (了解)通用爬虫和聚焦爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.2" data-path="file/part01/1.2.html">
            
                
                    <a href="../../file/part01/1.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.2.</b>
                        
                        (复习)HTTP/HTTPS的请求与响应
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.3" data-path="file/part01/1.3.html">
            
                
                    <a href="../../file/part01/1.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.3.</b>
                        
                        HTTP/HTTPS抓包工具-Fiddler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.4" data-path="file/part01/1.4.html">
            
                
                    <a href="../../file/part01/1.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.4.</b>
                        
                        urllib2模块的基本使用
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.5" data-path="file/part01/1.5.html">
            
                
                    <a href="../../file/part01/1.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.5.</b>
                        
                        urllib2：GET请求和POST请求
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.6" data-path="file/part01/1.6.html">
            
                
                    <a href="../../file/part01/1.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.6.</b>
                        
                        urllib2：Handler处理器和自定义Opener
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.7" data-path="file/part01/1.7.html">
            
                
                    <a href="../../file/part01/1.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.7.</b>
                        
                        urllib2：URLError与HTTPError
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.8" data-path="file/part01/1.8.html">
            
                
                    <a href="../../file/part01/1.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.8.</b>
                        
                        Requests模块
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="2" data-path="file/part02/2.html">
            
                
                    <a href="../../file/part02/2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.</b>
                        
                        非结构化数据与结构化数据提取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="2.1" data-path="file/part02/2.1.html">
            
                
                    <a href="../../file/part02/2.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.1.</b>
                        
                        正则表达式re模块
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.2" data-path="file/part02/2.2.html">
            
                
                    <a href="../../file/part02/2.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.2.</b>
                        
                        案例：使用正则表达式的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.3" data-path="file/part02/2.3.html">
            
                
                    <a href="../../file/part02/2.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.3.</b>
                        
                        XPath与lxml类库
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.4" data-path="file/part02/2.4.html">
            
                
                    <a href="../../file/part02/2.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.4.</b>
                        
                        案例：使用XPath的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.5" data-path="file/part02/2.5.html">
            
                
                    <a href="../../file/part02/2.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.5.</b>
                        
                        BeautifulSoup4 解析器
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.6" data-path="file/part02/2.6.html">
            
                
                    <a href="../../file/part02/2.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.6.</b>
                        
                        案例：使用bs4的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.7" data-path="file/part02/2.7.html">
            
                
                    <a href="../../file/part02/2.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.7.</b>
                        
                        JSON模块与JsonPath
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.8" data-path="file/part02/2.8.html">
            
                
                    <a href="../../file/part02/2.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.8.</b>
                        
                        糗事百科案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.9" data-path="file/part02/2.9.html">
            
                
                    <a href="../../file/part02/2.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.9.</b>
                        
                        多线程爬虫案例
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="3" data-path="file/part03/3.html">
            
                
                    <a href="../../file/part03/3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.</b>
                        
                        动态HTML处理和机器图像识别
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="3.1" data-path="file/part03/3.1.html">
            
                
                    <a href="../../file/part03/3.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.1.</b>
                        
                        动态HTML介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.2" data-path="file/part03/3.2.html">
            
                
                    <a href="../../file/part03/3.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.2.</b>
                        
                        Selenium与PhantomJS
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.3" data-path="file/part03/3.3.html">
            
                
                    <a href="../../file/part03/3.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.3.</b>
                        
                        案例一：网站模拟登录
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.4" data-path="file/part03/3.4.html">
            
                
                    <a href="../../file/part03/3.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.4.</b>
                        
                        案例二：动态页面模拟点击
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.5" data-path="file/part03/3.5.html">
            
                
                    <a href="../../file/part03/3.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.5.</b>
                        
                        案例三：执行JavaScript语句
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.6" data-path="file/part03/3.6.html">
            
                
                    <a href="../../file/part03/3.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.6.</b>
                        
                        机器视觉与Tesseract介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.7" data-path="file/part03/3.7.html">
            
                
                    <a href="../../file/part03/3.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.7.</b>
                        
                        处理一些格式规范的文字
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.8" data-path="file/part03/3.8.html">
            
                
                    <a href="../../file/part03/3.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.8.</b>
                        
                        案例：尝试对验证码进行机器识别处理
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.9" data-path="file/part03/3.9.html">
            
                
                    <a href="../../file/part03/3.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.9.</b>
                        
                        机器学习：训练Tesseract
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="4" data-path="file/part04/4.html">
            
                
                    <a href="../../file/part04/4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.</b>
                        
                        Scrapy框架
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="4.1" data-path="file/part04/4.1.html">
            
                
                    <a href="../../file/part04/4.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.1.</b>
                        
                        配置安装
                    </a>
            
            
        </li>
    
        <li class="chapter active" data-level="4.2" data-path="file/part04/4.2.html">
            
                
                    <a href="../../file/part04/4.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.2.</b>
                        
                        入门案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.3" data-path="file/part04/4.3.html">
            
                
                    <a href="../../file/part04/4.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.3.</b>
                        
                        Scrapy Shell
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.4" data-path="file/part04/4.4.html">
            
                
                    <a href="../../file/part04/4.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.4.</b>
                        
                        Item Pipeline
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.5" data-path="file/part04/4.5.html">
            
                
                    <a href="../../file/part04/4.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.5.</b>
                        
                        Spiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.6" data-path="file/part04/4.6.html">
            
                
                    <a href="../../file/part04/4.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.6.</b>
                        
                        CrawlSpiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.7" data-path="file/part04/4.7.html">
            
                
                    <a href="../../file/part04/4.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.7.</b>
                        
                        Request/Response
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.8" data-path="file/part04/4.8.html">
            
                
                    <a href="../../file/part04/4.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.8.</b>
                        
                        Downloader Middlewares
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.9" data-path="file/part04/4.9.html">
            
                
                    <a href="../../file/part04/4.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.9.</b>
                        
                        Settings
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="5" data-path="file/part05/5.html">
            
                
                    <a href="../../file/part05/5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.</b>
                        
                        Scrapy实战项目
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="5.1" data-path="file/part05/5.1.html">
            
                
                    <a href="../../file/part05/5.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.1.</b>
                        
                        (案例一)手机App抓包爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.2" data-path="file/part05/5.2.html">
            
                
                    <a href="../../file/part05/5.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.2.</b>
                        
                        (案例二)阳光热线问政平台爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.3" data-path="file/part05/5.3.html">
            
                
                    <a href="../../file/part05/5.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.3.</b>
                        
                        (案例三)新浪网分类资讯爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.4" data-path="file/part05/5.4.html">
            
                
                    <a href="../../file/part05/5.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.4.</b>
                        
                        (案例四)图片下载器爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.5" data-path="file/part05/5.5.html">
            
                
                    <a href="../../file/part05/5.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.5.</b>
                        
                        (案例五)将数据保存在MongoDB中
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.6" data-path="file/part05/5.6.html">
            
                
                    <a href="../../file/part05/5.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.6.</b>
                        
                        (案例六)三种scrapy模拟登陆策略
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.7" data-path="file/part05/5.7.html">
            
                
                    <a href="../../file/part05/5.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.7.</b>
                        
                        附：通过Fiddler进行手机抓包方法
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="6" data-path="file/part06/6.html">
            
                
                    <a href="../../file/part06/6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.</b>
                        
                        scrapy-redis分布式组件
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="6.1" data-path="file/part06/6.1.html">
            
                
                    <a href="../../file/part06/6.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.1.</b>
                        
                        源码分析参考：Connection
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.2" data-path="file/part06/6.2.html">
            
                
                    <a href="../../file/part06/6.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.2.</b>
                        
                        源码分析参考：Dupefilter
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.3" data-path="file/part06/6.3.html">
            
                
                    <a href="../../file/part06/6.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.3.</b>
                        
                        源码分析参考：Picklecompat
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.4" data-path="file/part06/6.4.html">
            
                
                    <a href="../../file/part06/6.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.4.</b>
                        
                        源码分析参考：Pipelines
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.5" data-path="file/part06/6.5.html">
            
                
                    <a href="../../file/part06/6.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.5.</b>
                        
                        源码分析参考：Queue
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.6" data-path="file/part06/6.6.html">
            
                
                    <a href="../../file/part06/6.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.6.</b>
                        
                        源码分析参考：Scheduler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.7" data-path="file/part06/6.7.html">
            
                
                    <a href="../../file/part06/6.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.7.</b>
                        
                        源码分析参考：Spider
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="7" data-path="file/part07/7.html">
            
                
                    <a href="../../file/part07/7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.</b>
                        
                        scrapy-redis实战
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="7.1" data-path="file/part07/7.1.html">
            
                
                    <a href="../../file/part07/7.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.1.</b>
                        
                        源码自带项目说明
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.2" data-path="file/part07/7.2.html">
            
                
                    <a href="../../file/part07/7.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.2.</b>
                        
                        有缘网分布式爬虫项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.3" data-path="file/part07/7.3.html">
            
                
                    <a href="../../file/part07/7.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.3.</b>
                        
                        有缘网分布式爬虫项目2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.4" data-path="file/part07/7.4.html">
            
                
                    <a href="../../file/part07/7.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.4.</b>
                        
                        处理Redis里的数据
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.5" data-path="file/part07/7.5.html">
            
                
                    <a href="../../file/part07/7.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.5.</b>
                        
                        尝试改写新浪网分类资讯爬虫1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.6" data-path="file/part07/7.6.html">
            
                
                    <a href="../../file/part07/7.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.6.</b>
                        
                        尝试改写新浪网分类资讯爬虫2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.7" data-path="file/part07/7.7.html">
            
                
                    <a href="../../file/part07/7.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.7.</b>
                        
                        IT桔子分布式项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.8" data-path="file/part07/7.8.html">
            
                
                    <a href="../../file/part07/7.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.8.</b>
                        
                        IT桔子分布式项目2
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="8" data-path="file/duanzi/duanzi.html">
            
                
                    <a href="../../file/duanzi/duanzi.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>8.</b>
                        
                        课余段子
                    </a>
            
            
        </li>
    


            
            <li class="divider"></li>
            <li>
                <a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
                    Published with GitBook
                </a>
            </li>
            
        </ul>
    </nav>
</div>

    <div class="book-body">
        <div class="body-inner">
            <div class="book-header" role="navigation">
    <!-- Actions Left -->
    

    <!-- Title -->
    <h1>
        <i class="fa fa-circle-o-notch fa-spin"></i>
        <a href="../../" >Python爬虫课程讲义</a>
    </h1>
</div>

            <div class="page-wrapper" tabindex="-1" role="main">
                <div class="page-inner">
                
                
                    <section class="normal" id="section-">
                    
                        <h1 id="&#x5165;&#x95E8;&#x6848;&#x4F8B;">&#x5165;&#x95E8;&#x6848;&#x4F8B;</h1>
<h2 id="&#x5B66;&#x4E60;&#x76EE;&#x6807;">&#x5B66;&#x4E60;&#x76EE;&#x6807;</h2>
<ul>
<li>&#x521B;&#x5EFA;&#x4E00;&#x4E2A;Scrapy&#x9879;&#x76EE;</li>
<li>&#x5B9A;&#x4E49;&#x63D0;&#x53D6;&#x7684;&#x7ED3;&#x6784;&#x5316;&#x6570;&#x636E;(Item)</li>
<li>&#x7F16;&#x5199;&#x722C;&#x53D6;&#x7F51;&#x7AD9;&#x7684; Spider &#x5E76;&#x63D0;&#x53D6;&#x51FA;&#x7ED3;&#x6784;&#x5316;&#x6570;&#x636E;(Item)</li>
<li>&#x7F16;&#x5199; Item Pipelines &#x6765;&#x5B58;&#x50A8;&#x63D0;&#x53D6;&#x5230;&#x7684;Item(&#x5373;&#x7ED3;&#x6784;&#x5316;&#x6570;&#x636E;)</li>
</ul>
<h2 id="&#x4E00;-&#x65B0;&#x5EFA;&#x9879;&#x76EE;scrapy-startproject">&#x4E00;. &#x65B0;&#x5EFA;&#x9879;&#x76EE;(scrapy startproject)</h2>
<ul>
<li>&#x5728;&#x5F00;&#x59CB;&#x722C;&#x53D6;&#x4E4B;&#x524D;&#xFF0C;&#x5FC5;&#x987B;&#x521B;&#x5EFA;&#x4E00;&#x4E2A;&#x65B0;&#x7684;Scrapy&#x9879;&#x76EE;&#x3002;&#x8FDB;&#x5165;&#x81EA;&#x5B9A;&#x4E49;&#x7684;&#x9879;&#x76EE;&#x76EE;&#x5F55;&#x4E2D;&#xFF0C;&#x8FD0;&#x884C;&#x4E0B;&#x5217;&#x547D;&#x4EE4;&#xFF1A;</li>
</ul>
<pre><code>scrapy startproject mySpider
</code></pre><ul>
<li>&#x5176;&#x4E2D;&#xFF0C; mySpider &#x4E3A;&#x9879;&#x76EE;&#x540D;&#x79F0;&#xFF0C;&#x53EF;&#x4EE5;&#x770B;&#x5230;&#x5C06;&#x4F1A;&#x521B;&#x5EFA;&#x4E00;&#x4E2A; mySpider &#x6587;&#x4EF6;&#x5939;&#xFF0C;&#x76EE;&#x5F55;&#x7ED3;&#x6784;&#x5927;&#x81F4;&#x5982;&#x4E0B;&#xFF1A;</li>
</ul>
<p><img src="../images/7.2.png" alt="mySpider &#x9879;&#x76EE;&#x7684;&#x76EE;&#x5F55;&#x7ED3;&#x6784;"></p>
<p>&#x4E0B;&#x9762;&#x6765;&#x7B80;&#x5355;&#x4ECB;&#x7ECD;&#x4E00;&#x4E0B;&#x5404;&#x4E2A;&#x4E3B;&#x8981;&#x6587;&#x4EF6;&#x7684;&#x4F5C;&#x7528;&#xFF1A;</p>
<blockquote>
<p>scrapy.cfg &#xFF1A;&#x9879;&#x76EE;&#x7684;&#x914D;&#x7F6E;&#x6587;&#x4EF6;</p>
<p>mySpider/ &#xFF1A;&#x9879;&#x76EE;&#x7684;Python&#x6A21;&#x5757;&#xFF0C;&#x5C06;&#x4F1A;&#x4ECE;&#x8FD9;&#x91CC;&#x5F15;&#x7528;&#x4EE3;&#x7801;</p>
<p>mySpider/items.py &#xFF1A;&#x9879;&#x76EE;&#x7684;&#x76EE;&#x6807;&#x6587;&#x4EF6;</p>
<p>mySpider/pipelines.py &#xFF1A;&#x9879;&#x76EE;&#x7684;&#x7BA1;&#x9053;&#x6587;&#x4EF6;</p>
<p>mySpider/settings.py &#xFF1A;&#x9879;&#x76EE;&#x7684;&#x8BBE;&#x7F6E;&#x6587;&#x4EF6;</p>
<p>mySpider/spiders/ &#xFF1A;&#x5B58;&#x50A8;&#x722C;&#x866B;&#x4EE3;&#x7801;&#x76EE;&#x5F55;</p>
</blockquote>
<h2 id="&#x4E8C;&#x3001;&#x660E;&#x786E;&#x76EE;&#x6807;myspideritemspy">&#x4E8C;&#x3001;&#x660E;&#x786E;&#x76EE;&#x6807;(mySpider/items.py)</h2>
<p>&#x6211;&#x4EEC;&#x6253;&#x7B97;&#x6293;&#x53D6;&#xFF1A;<a href="http://www.itcast.cn/channel/teacher.shtml" target="_blank">http://www.itcast.cn/channel/teacher.shtml</a> &#x7F51;&#x7AD9;&#x91CC;&#x7684;&#x6240;&#x6709;&#x8BB2;&#x5E08;&#x7684;&#x59D3;&#x540D;&#x3001;&#x804C;&#x79F0;&#x548C;&#x4E2A;&#x4EBA;&#x4FE1;&#x606F;&#x3002;</p>
<ol>
<li><p>&#x6253;&#x5F00;mySpider&#x76EE;&#x5F55;&#x4E0B;&#x7684;items.py</p>
</li>
<li><p>Item &#x5B9A;&#x4E49;&#x7ED3;&#x6784;&#x5316;&#x6570;&#x636E;&#x5B57;&#x6BB5;&#xFF0C;&#x7528;&#x6765;&#x4FDD;&#x5B58;&#x722C;&#x53D6;&#x5230;&#x7684;&#x6570;&#x636E;&#xFF0C;&#x6709;&#x70B9;&#x50CF;Python&#x4E2D;&#x7684;dict&#xFF0C;&#x4F46;&#x662F;&#x63D0;&#x4F9B;&#x4E86;&#x4E00;&#x4E9B;&#x989D;&#x5916;&#x7684;&#x4FDD;&#x62A4;&#x51CF;&#x5C11;&#x9519;&#x8BEF;&#x3002;</p>
</li>
<li><p>&#x53EF;&#x4EE5;&#x901A;&#x8FC7;&#x521B;&#x5EFA;&#x4E00;&#x4E2A; scrapy.Item &#x7C7B;&#xFF0C; &#x5E76;&#x4E14;&#x5B9A;&#x4E49;&#x7C7B;&#x578B;&#x4E3A; scrapy.Field&#x7684;&#x7C7B;&#x5C5E;&#x6027;&#x6765;&#x5B9A;&#x4E49;&#x4E00;&#x4E2A;Item&#xFF08;&#x53EF;&#x4EE5;&#x7406;&#x89E3;&#x6210;&#x7C7B;&#x4F3C;&#x4E8E;ORM&#x7684;&#x6620;&#x5C04;&#x5173;&#x7CFB;&#xFF09;&#x3002;</p>
</li>
<li><p>&#x63A5;&#x4E0B;&#x6765;&#xFF0C;&#x521B;&#x5EFA;&#x4E00;&#x4E2A;ItcastItem &#x7C7B;&#xFF0C;&#x548C;&#x6784;&#x5EFA;item&#x6A21;&#x578B;&#xFF08;model&#xFF09;&#x3002;</p>
</li>
</ol>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> scrapy

<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">ItcastItem</span><span class="hljs-params">(scrapy.Item)</span>:</span>
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()
</code></pre>
<h2 id="&#x4E09;&#x3001;&#x5236;&#x4F5C;&#x722C;&#x866B;-&#xFF08;spidersitcastspiderpy&#xFF09;">&#x4E09;&#x3001;&#x5236;&#x4F5C;&#x722C;&#x866B; &#xFF08;spiders/itcastSpider.py&#xFF09;</h2>
<p><strong>&#x722C;&#x866B;&#x529F;&#x80FD;&#x8981;&#x5206;&#x4E24;&#x6B65;&#xFF1A;</strong></p>
<h3 id="1-&#x722C;&#x6570;&#x636E;">1. &#x722C;&#x6570;&#x636E;</h3>
<ul>
<li>&#x5728;&#x5F53;&#x524D;&#x76EE;&#x5F55;&#x4E0B;&#x8F93;&#x5165;&#x547D;&#x4EE4;&#xFF0C;&#x5C06;&#x5728;<code>mySpider/spiders</code>&#x76EE;&#x5F55;&#x4E0B;&#x521B;&#x5EFA;&#x4E00;&#x4E2A;&#x540D;&#x4E3A;<code>itcast</code>&#x7684;&#x722C;&#x866B;&#xFF0C;&#x5E76;&#x6307;&#x5B9A;&#x722C;&#x53D6;&#x57DF;&#x7684;&#x8303;&#x56F4;&#xFF1A;</li>
</ul>
<pre><code>scrapy genspider itcast &quot;itcast.cn&quot;
</code></pre><ul>
<li>&#x6253;&#x5F00; mySpider/spiders&#x76EE;&#x5F55;&#x91CC;&#x7684; itcast.py&#xFF0C;&#x9ED8;&#x8BA4;&#x589E;&#x52A0;&#x4E86;&#x4E0B;&#x5217;&#x4EE3;&#x7801;:</li>
</ul>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> scrapy

<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">ItcastSpider</span><span class="hljs-params">(scrapy.Spider)</span>:</span>
    name = <span class="hljs-string">&quot;itcast&quot;</span>
    allowed_domains = [<span class="hljs-string">&quot;itcast.cn&quot;</span>]
    start_urls = (
        <span class="hljs-string">&apos;http://www.itcast.cn/&apos;</span>,
    )

    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse</span><span class="hljs-params">(self, response)</span>:</span>
        <span class="hljs-keyword">pass</span>
</code></pre>
<h6 id="&#x5176;&#x5B9E;&#x4E5F;&#x53EF;&#x4EE5;&#x7531;&#x6211;&#x4EEC;&#x81EA;&#x884C;&#x521B;&#x5EFA;itcastpy&#x5E76;&#x7F16;&#x5199;&#x4E0A;&#x9762;&#x7684;&#x4EE3;&#x7801;&#xFF0C;&#x53EA;&#x4E0D;&#x8FC7;&#x4F7F;&#x7528;&#x547D;&#x4EE4;&#x53EF;&#x4EE5;&#x514D;&#x53BB;&#x7F16;&#x5199;&#x56FA;&#x5B9A;&#x4EE3;&#x7801;&#x7684;&#x9EBB;&#x70E6;">&#x5176;&#x5B9E;&#x4E5F;&#x53EF;&#x4EE5;&#x7531;&#x6211;&#x4EEC;&#x81EA;&#x884C;&#x521B;&#x5EFA;itcast.py&#x5E76;&#x7F16;&#x5199;&#x4E0A;&#x9762;&#x7684;&#x4EE3;&#x7801;&#xFF0C;&#x53EA;&#x4E0D;&#x8FC7;&#x4F7F;&#x7528;&#x547D;&#x4EE4;&#x53EF;&#x4EE5;&#x514D;&#x53BB;&#x7F16;&#x5199;&#x56FA;&#x5B9A;&#x4EE3;&#x7801;&#x7684;&#x9EBB;&#x70E6;</h6>
<p>&#x8981;&#x5EFA;&#x7ACB;&#x4E00;&#x4E2A;Spider&#xFF0C; &#x4F60;&#x5FC5;&#x987B;&#x7528;scrapy.Spider&#x7C7B;&#x521B;&#x5EFA;&#x4E00;&#x4E2A;&#x5B50;&#x7C7B;&#xFF0C;&#x5E76;&#x786E;&#x5B9A;&#x4E86;&#x4E09;&#x4E2A;&#x5F3A;&#x5236;&#x7684;&#x5C5E;&#x6027; &#x548C; &#x4E00;&#x4E2A;&#x65B9;&#x6CD5;&#x3002;</p>
<ul>
<li><p><code>name = &quot;&quot;</code> &#xFF1A;&#x8FD9;&#x4E2A;&#x722C;&#x866B;&#x7684;&#x8BC6;&#x522B;&#x540D;&#x79F0;&#xFF0C;&#x5FC5;&#x987B;&#x662F;&#x552F;&#x4E00;&#x7684;&#xFF0C;&#x5728;&#x4E0D;&#x540C;&#x7684;&#x722C;&#x866B;&#x5FC5;&#x987B;&#x5B9A;&#x4E49;&#x4E0D;&#x540C;&#x7684;&#x540D;&#x5B57;&#x3002;</p>
</li>
<li><p><code>allowed_domains = []</code> &#x662F;&#x641C;&#x7D22;&#x7684;&#x57DF;&#x540D;&#x8303;&#x56F4;&#xFF0C;&#x4E5F;&#x5C31;&#x662F;&#x722C;&#x866B;&#x7684;&#x7EA6;&#x675F;&#x533A;&#x57DF;&#xFF0C;&#x89C4;&#x5B9A;&#x722C;&#x866B;&#x53EA;&#x722C;&#x53D6;&#x8FD9;&#x4E2A;&#x57DF;&#x540D;&#x4E0B;&#x7684;&#x7F51;&#x9875;&#xFF0C;&#x4E0D;&#x5728;&#x6B64;&#x8303;&#x56F4;&#x5185;&#x7684;URL&#x4F1A;&#x88AB;&#x5FFD;&#x7565;&#x3002;</p>
</li>
<li><p><code>start_urls = ()</code> &#xFF1A;&#x722C;&#x53D6;&#x7684;URL&#x5143;&#x7EC4;/&#x5217;&#x8868;&#x3002;&#x722C;&#x866B;&#x4ECE;&#x8FD9;&#x91CC;&#x5F00;&#x59CB;&#x6293;&#x53D6;&#x6570;&#x636E;&#xFF0C;&#x6240;&#x4EE5;&#xFF0C;&#x7B2C;&#x4E00;&#x6B21;&#x4E0B;&#x8F7D;&#x7684;&#x6570;&#x636E;&#x5C06;&#x4F1A;&#x4ECE;&#x8FD9;&#x4E9B;urls&#x5F00;&#x59CB;&#x3002;&#x5176;&#x4ED6;&#x5B50;URL&#x5C06;&#x4F1A;&#x4ECE;&#x8FD9;&#x4E9B;&#x8D77;&#x59CB;URL&#x4E2D;&#x7EE7;&#x627F;&#x6027;&#x751F;&#x6210;&#x3002;</p>
</li>
<li><p><code>parse(self, response)</code> &#xFF1A;&#x89E3;&#x6790;&#x7684;&#x65B9;&#x6CD5;&#xFF0C;&#x6BCF;&#x4E2A;&#x521D;&#x59CB;URL&#x5B8C;&#x6210;&#x4E0B;&#x8F7D;&#x540E;&#x5C06;&#x88AB;&#x8C03;&#x7528;&#xFF0C;&#x8C03;&#x7528;&#x7684;&#x65F6;&#x5019;&#x4F20;&#x5165;&#x4ECE;&#x6BCF;&#x4E00;&#x4E2A;URL&#x4F20;&#x56DE;&#x7684;Response&#x5BF9;&#x8C61;&#x6765;&#x4F5C;&#x4E3A;&#x552F;&#x4E00;&#x53C2;&#x6570;&#xFF0C;&#x4E3B;&#x8981;&#x4F5C;&#x7528;&#x5982;&#x4E0B;&#xFF1A;</p>
<ol>
<li>&#x8D1F;&#x8D23;&#x89E3;&#x6790;&#x8FD4;&#x56DE;&#x7684;&#x7F51;&#x9875;&#x6570;&#x636E;(response.body)&#xFF0C;&#x63D0;&#x53D6;&#x7ED3;&#x6784;&#x5316;&#x6570;&#x636E;(&#x751F;&#x6210;item)</li>
<li>&#x751F;&#x6210;&#x9700;&#x8981;&#x4E0B;&#x4E00;&#x9875;&#x7684;URL&#x8BF7;&#x6C42;&#x3002;</li>
</ol>
</li>
</ul>
<h5 id="&#x5C06;starturls&#x7684;&#x503C;&#x4FEE;&#x6539;&#x4E3A;&#x9700;&#x8981;&#x722C;&#x53D6;&#x7684;&#x7B2C;&#x4E00;&#x4E2A;url">&#x5C06;start_urls&#x7684;&#x503C;&#x4FEE;&#x6539;&#x4E3A;&#x9700;&#x8981;&#x722C;&#x53D6;&#x7684;&#x7B2C;&#x4E00;&#x4E2A;url</h5>
<pre><code class="lang-python">start_urls = (<span class="hljs-string">&quot;http://www.itcast.cn/channel/teacher.shtml&quot;</span>,)
</code></pre>
<h5 id="&#x4FEE;&#x6539;parse&#x65B9;&#x6CD5;">&#x4FEE;&#x6539;parse()&#x65B9;&#x6CD5;</h5>
<pre><code class="lang-python"><span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse</span><span class="hljs-params">(self, response)</span>:</span>
    filename = <span class="hljs-string">&quot;teacher.html&quot;</span>
    <span class="hljs-keyword">with</span> open(filename, <span class="hljs-string">&apos;wb&apos;</span>) <span class="hljs-keyword">as</span> f:
        f.write(response.body)
</code></pre>
<p>&#x7136;&#x540E;&#x8FD0;&#x884C;&#x4E00;&#x4E0B;&#x770B;&#x770B;&#xFF0C;&#x5728;mySpider&#x76EE;&#x5F55;&#x4E0B;&#x6267;&#x884C;&#xFF1A;</p>
<pre><code class="lang-python">scrapy crawl itcast
</code></pre>
<p>&#x662F;&#x7684;&#xFF0C;&#x5C31;&#x662F; itcast&#xFF0C;&#x770B;&#x4E0A;&#x9762;&#x4EE3;&#x7801;&#xFF0C;&#x5B83;&#x662F; ItcastSpider &#x7C7B;&#x7684; name &#x5C5E;&#x6027;&#xFF0C;&#x4E5F;&#x5C31;&#x662F;&#x4F7F;&#x7528; <code>scrapy genspider</code>&#x547D;&#x4EE4;&#x7684;&#x552F;&#x4E00;&#x722C;&#x866B;&#x540D;&#x3002;</p>
<p>&#x8FD0;&#x884C;&#x4E4B;&#x540E;&#xFF0C;&#x5982;&#x679C;&#x6253;&#x5370;&#x7684;&#x65E5;&#x5FD7;&#x51FA;&#x73B0; 
<code>[scrapy] INFO: Spider closed (finished)</code>&#xFF0C;&#x4EE3;&#x8868;&#x6267;&#x884C;&#x5B8C;&#x6210;&#x3002; &#x4E4B;&#x540E;&#x5F53;&#x524D;&#x6587;&#x4EF6;&#x5939;&#x4E2D;&#x5C31;&#x51FA;&#x73B0;&#x4E86;&#x4E00;&#x4E2A; teacher.html &#x6587;&#x4EF6;&#xFF0C;&#x91CC;&#x9762;&#x5C31;&#x662F;&#x6211;&#x4EEC;&#x521A;&#x521A;&#x8981;&#x722C;&#x53D6;&#x7684;&#x7F51;&#x9875;&#x7684;&#x5168;&#x90E8;&#x6E90;&#x4EE3;&#x7801;&#x4FE1;&#x606F;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-comment"># &#x6CE8;&#x610F;&#xFF0C;Python2.x&#x9ED8;&#x8BA4;&#x7F16;&#x7801;&#x73AF;&#x5883;&#x662F;ASCII&#xFF0C;&#x5F53;&#x548C;&#x53D6;&#x56DE;&#x7684;&#x6570;&#x636E;&#x7F16;&#x7801;&#x683C;&#x5F0F;&#x4E0D;&#x4E00;&#x81F4;&#x65F6;&#xFF0C;&#x53EF;&#x80FD;&#x4F1A;&#x9020;&#x6210;&#x4E71;&#x7801;&#xFF1B;</span>
<span class="hljs-comment"># &#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x6307;&#x5B9A;&#x4FDD;&#x5B58;&#x5185;&#x5BB9;&#x7684;&#x7F16;&#x7801;&#x683C;&#x5F0F;&#xFF0C;&#x4E00;&#x822C;&#x60C5;&#x51B5;&#x4E0B;&#xFF0C;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x5728;&#x4EE3;&#x7801;&#x6700;&#x4E0A;&#x65B9;&#x6DFB;&#x52A0;&#xFF1A;</span>

    <span class="hljs-keyword">import</span> sys
    reload(sys)
    sys.setdefaultencoding(<span class="hljs-string">&quot;utf-8&quot;</span>)

<span class="hljs-comment"># &#x8FD9;&#x4E09;&#x884C;&#x4EE3;&#x7801;&#x662F;Python2.x&#x91CC;&#x89E3;&#x51B3;&#x4E2D;&#x6587;&#x7F16;&#x7801;&#x7684;&#x4E07;&#x80FD;&#x94A5;&#x5319;&#xFF0C;&#x7ECF;&#x8FC7;&#x8FD9;&#x4E48;&#x591A;&#x5E74;&#x7684;&#x5410;&#x69FD;&#x540E;Python3&#x5B66;&#x4E56;&#x4E86;&#xFF0C;&#x9ED8;&#x8BA4;&#x7F16;&#x7801;&#x662F;Unicode&#x4E86;...(&#x795D;&#x5927;&#x5BB6;&#x65E9;&#x65E5;&#x62E5;&#x62B1;Python3)</span>
</code></pre>
<hr>
<h3 id="2-&#x53D6;&#x6570;&#x636E;">2. &#x53D6;&#x6570;&#x636E;</h3>
<ul>
<li>&#x722C;&#x53D6;&#x6574;&#x4E2A;&#x7F51;&#x9875;&#x5B8C;&#x6BD5;&#xFF0C;&#x63A5;&#x4E0B;&#x6765;&#x5C31;&#x662F;&#x63D0;&#x53D6;&#x8FC7;&#x7A0B;&#x4E86;&#xFF0C;&#x9996;&#x5148;&#x89C2;&#x5BDF;&#x9875;&#x9762;&#x6E90;&#x7801;&#xFF1A;</li>
</ul>
<p><img src="../images/teacher_html.png" alt="teacher.shtml &#x9875;&#x9762;&#x6E90;&#x7801;&#x622A;&#x56FE;"></p>
<pre><code class="lang-html"><span class="hljs-tag">&lt;<span class="hljs-title">div</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;li_txt&quot;</span>&gt;</span>
    <span class="hljs-tag">&lt;<span class="hljs-title">h3</span>&gt;</span>  xxx  <span class="hljs-tag">&lt;/<span class="hljs-title">h3</span>&gt;</span>
    <span class="hljs-tag">&lt;<span class="hljs-title">h4</span>&gt;</span> xxxxx <span class="hljs-tag">&lt;/<span class="hljs-title">h4</span>&gt;</span>
    <span class="hljs-tag">&lt;<span class="hljs-title">p</span>&gt;</span> xxxxxxxx <span class="hljs-tag">&lt;/<span class="hljs-title">p</span>&gt;</span>
</code></pre>
<h4 id="&#x662F;&#x4E0D;&#x662F;&#x4E00;&#x76EE;&#x4E86;&#x7136;&#xFF1F;&#x76F4;&#x63A5;&#x4E0A;xpath&#x5F00;&#x59CB;&#x63D0;&#x53D6;&#x6570;&#x636E;&#x5427;&#x3002;">&#x662F;&#x4E0D;&#x662F;&#x4E00;&#x76EE;&#x4E86;&#x7136;&#xFF1F;&#x76F4;&#x63A5;&#x4E0A;XPath&#x5F00;&#x59CB;&#x63D0;&#x53D6;&#x6570;&#x636E;&#x5427;&#x3002;</h4>
<ul>
<li>&#x6211;&#x4EEC;&#x4E4B;&#x524D;&#x5728;mySpider/items.py &#x91CC;&#x5B9A;&#x4E49;&#x4E86;&#x4E00;&#x4E2A;ItcastItem&#x7C7B;&#x3002; &#x8FD9;&#x91CC;&#x5F15;&#x5165;&#x8FDB;&#x6765;</li>
</ul>
<pre><code class="lang-python">  <span class="hljs-keyword">from</span> mySpider.items <span class="hljs-keyword">import</span> ItcastItem
</code></pre>
<ul>
<li>&#x7136;&#x540E;&#x5C06;&#x6211;&#x4EEC;&#x5F97;&#x5230;&#x7684;&#x6570;&#x636E;&#x5C01;&#x88C5;&#x5230;&#x4E00;&#x4E2A; <code>ItcastItem</code> &#x5BF9;&#x8C61;&#x4E2D;&#xFF0C;&#x53EF;&#x4EE5;&#x4FDD;&#x5B58;&#x6BCF;&#x4E2A;&#x8001;&#x5E08;&#x7684;&#x5C5E;&#x6027;&#xFF1A;</li>
</ul>
<pre><code class="lang-python"><span class="hljs-keyword">from</span> mySpider.items <span class="hljs-keyword">import</span> ItcastItem

<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse</span><span class="hljs-params">(self, response)</span>:</span>
    <span class="hljs-comment">#open(&quot;teacher.html&quot;,&quot;wb&quot;).write(response.body).close()</span>

    <span class="hljs-comment"># &#x5B58;&#x653E;&#x8001;&#x5E08;&#x4FE1;&#x606F;&#x7684;&#x96C6;&#x5408;</span>
    items = []

    <span class="hljs-keyword">for</span> each <span class="hljs-keyword">in</span> response.xpath(<span class="hljs-string">&quot;//div[@class=&apos;li_txt&apos;]&quot;</span>):
        <span class="hljs-comment"># &#x5C06;&#x6211;&#x4EEC;&#x5F97;&#x5230;&#x7684;&#x6570;&#x636E;&#x5C01;&#x88C5;&#x5230;&#x4E00;&#x4E2A; `ItcastItem` &#x5BF9;&#x8C61;</span>
        item = ItcastItem()
        <span class="hljs-comment">#extract()&#x65B9;&#x6CD5;&#x8FD4;&#x56DE;&#x7684;&#x90FD;&#x662F;unicode&#x5B57;&#x7B26;&#x4E32;</span>
        name = each.xpath(<span class="hljs-string">&quot;h3/text()&quot;</span>).extract()
        title = each.xpath(<span class="hljs-string">&quot;h4/text()&quot;</span>).extract()
        info = each.xpath(<span class="hljs-string">&quot;p/text()&quot;</span>).extract()

        <span class="hljs-comment">#xpath&#x8FD4;&#x56DE;&#x7684;&#x662F;&#x5305;&#x542B;&#x4E00;&#x4E2A;&#x5143;&#x7D20;&#x7684;&#x5217;&#x8868;</span>
        item[<span class="hljs-string">&apos;name&apos;</span>] = name[<span class="hljs-number">0</span>]
        item[<span class="hljs-string">&apos;title&apos;</span>] = title[<span class="hljs-number">0</span>]
        item[<span class="hljs-string">&apos;info&apos;</span>] = info[<span class="hljs-number">0</span>]

        items.append(item)

    <span class="hljs-comment"># &#x76F4;&#x63A5;&#x8FD4;&#x56DE;&#x6700;&#x540E;&#x6570;&#x636E;</span>
    <span class="hljs-keyword">return</span> items
</code></pre>
<ul>
<li>&#x6211;&#x4EEC;&#x6682;&#x65F6;&#x5148;&#x4E0D;&#x5904;&#x7406;&#x7BA1;&#x9053;&#xFF0C;&#x540E;&#x9762;&#x4F1A;&#x8BE6;&#x7EC6;&#x4ECB;&#x7ECD;&#x3002;</li>
</ul>
<h2 id="&#x4FDD;&#x5B58;&#x6570;&#x636E;">&#x4FDD;&#x5B58;&#x6570;&#x636E;</h2>
<h5 id="scrapy&#x4FDD;&#x5B58;&#x4FE1;&#x606F;&#x7684;&#x6700;&#x7B80;&#x5355;&#x7684;&#x65B9;&#x6CD5;&#x4E3B;&#x8981;&#x6709;&#x56DB;&#x79CD;&#xFF0C;o-&#x8F93;&#x51FA;&#x6307;&#x5B9A;&#x683C;&#x5F0F;&#x7684;&#x6587;&#x4EF6;&#xFF0C;&#xFF0C;&#x547D;&#x4EE4;&#x5982;&#x4E0B;&#xFF1A;">scrapy&#x4FDD;&#x5B58;&#x4FE1;&#x606F;&#x7684;&#x6700;&#x7B80;&#x5355;&#x7684;&#x65B9;&#x6CD5;&#x4E3B;&#x8981;&#x6709;&#x56DB;&#x79CD;&#xFF0C;-o &#x8F93;&#x51FA;&#x6307;&#x5B9A;&#x683C;&#x5F0F;&#x7684;&#x6587;&#x4EF6;&#xFF0C;&#x547D;&#x4EE4;&#x5982;&#x4E0B;&#xFF1A;</h5>
<pre><code class="lang-python"><span class="hljs-comment"># json&#x683C;&#x5F0F;&#xFF0C;&#x9ED8;&#x8BA4;&#x4E3A;Unicode&#x7F16;&#x7801;</span>
scrapy crawl itcast -o teachers.json

<span class="hljs-comment"># json lines&#x683C;&#x5F0F;&#xFF0C;&#x9ED8;&#x8BA4;&#x4E3A;Unicode&#x7F16;&#x7801;</span>
scrapy crawl itcast -o teachers.jsonl

<span class="hljs-comment"># csv &#x9017;&#x53F7;&#x5206;&#x9694;&#x503C;&#x683C;&#x5F0F;&#xFF0C;&#x53EF;&#x7528;Excel&#x6253;&#x5F00;</span>
scrapy crawl itcast -o teachers.csv

<span class="hljs-comment"># xml&#x683C;&#x5F0F;</span>
scrapy crawl itcast -o teachers.xml
</code></pre>
<hr>
<h2 id="&#x601D;&#x8003;">&#x601D;&#x8003;</h2>
<h4 id="&#x5982;&#x679C;&#x5C06;&#x4EE3;&#x7801;&#x6539;&#x6210;&#x4E0B;&#x9762;&#x5F62;&#x5F0F;&#xFF0C;&#x7ED3;&#x679C;&#x5B8C;&#x5168;&#x4E00;&#x6837;&#x3002;">&#x5982;&#x679C;&#x5C06;&#x4EE3;&#x7801;&#x6539;&#x6210;&#x4E0B;&#x9762;&#x5F62;&#x5F0F;&#xFF0C;&#x7ED3;&#x679C;&#x5B8C;&#x5168;&#x4E00;&#x6837;&#x3002;</h4>
<h4 id="&#x8BF7;&#x601D;&#x8003;-yield-&#x5728;&#x8FD9;&#x91CC;&#x7684;&#x4F5C;&#x7528;&#xFF1A;">&#x8BF7;&#x601D;&#x8003; yield &#x5728;&#x8FD9;&#x91CC;&#x7684;&#x4F5C;&#x7528;&#xFF1A;</h4>
<pre><code class="lang-python"><span class="hljs-keyword">from</span> mySpider.items <span class="hljs-keyword">import</span> ItcastItem

<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse</span><span class="hljs-params">(self, response)</span>:</span>
    <span class="hljs-comment">#open(&quot;teacher.html&quot;,&quot;wb&quot;).write(response.body).close()</span>

    <span class="hljs-comment"># &#x5B58;&#x653E;&#x8001;&#x5E08;&#x4FE1;&#x606F;&#x7684;&#x96C6;&#x5408;</span>
    <span class="hljs-comment">#items = []</span>

    <span class="hljs-keyword">for</span> each <span class="hljs-keyword">in</span> response.xpath(<span class="hljs-string">&quot;//div[@class=&apos;li_txt&apos;]&quot;</span>):
        <span class="hljs-comment"># &#x5C06;&#x6211;&#x4EEC;&#x5F97;&#x5230;&#x7684;&#x6570;&#x636E;&#x5C01;&#x88C5;&#x5230;&#x4E00;&#x4E2A; `ItcastItem` &#x5BF9;&#x8C61;</span>
        item = ItcastItem()
        <span class="hljs-comment">#extract()&#x65B9;&#x6CD5;&#x8FD4;&#x56DE;&#x7684;&#x90FD;&#x662F;unicode&#x5B57;&#x7B26;&#x4E32;</span>
        name = each.xpath(<span class="hljs-string">&quot;h3/text()&quot;</span>).extract()
        title = each.xpath(<span class="hljs-string">&quot;h4/text()&quot;</span>).extract()
        info = each.xpath(<span class="hljs-string">&quot;p/text()&quot;</span>).extract()

        <span class="hljs-comment">#xpath&#x8FD4;&#x56DE;&#x7684;&#x662F;&#x5305;&#x542B;&#x4E00;&#x4E2A;&#x5143;&#x7D20;&#x7684;&#x5217;&#x8868;</span>
        item[<span class="hljs-string">&apos;name&apos;</span>] = name[<span class="hljs-number">0</span>]
        item[<span class="hljs-string">&apos;title&apos;</span>] = title[<span class="hljs-number">0</span>]
        item[<span class="hljs-string">&apos;info&apos;</span>] = info[<span class="hljs-number">0</span>]

        <span class="hljs-comment">#items.append(item)</span>

        <span class="hljs-comment">#&#x5C06;&#x83B7;&#x53D6;&#x7684;&#x6570;&#x636E;&#x4EA4;&#x7ED9;pipelines</span>
        <span class="hljs-keyword">yield</span> item

    <span class="hljs-comment"># &#x8FD4;&#x56DE;&#x6570;&#x636E;&#xFF0C;&#x4E0D;&#x7ECF;&#x8FC7;pipeline</span>
    <span class="hljs-comment">#return items</span>
</code></pre>
<footer class="page-footer"><span class="copyright">Copyright &#xA9; BigCat all rights reserved&#xFF0C;powered by Gitbook</span><span class="footer-modification">&#x300C;Revision Time:
2017-02-05 22:51:12&#x300D;
</span></footer>
                    
                    </section>
                
                
                </div>
            </div>
        </div>

        
        <a href="../../file/part04/4.1.html" class="navigation navigation-prev " aria-label="Previous page: 配置安装"><i class="fa fa-angle-left"></i></a>
        
        
        <a href="../../file/part04/4.3.html" class="navigation navigation-next " aria-label="Next page: Scrapy Shell"><i class="fa fa-angle-right"></i></a>
        
    </div>
</div>

        
<script src="../../gitbook/app.js"></script>

    
    <script src="../../gitbook/plugins/gitbook-plugin-splitter/splitter.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-fontsettings/buttons.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-livereload/plugin.js"></script>
    

<script>
// Request the "gitbook" module and, in the callback that receives it,
// start it with this page's plugin configuration.
require(["gitbook"], function(gitbook) {
    // Settings object keyed by plugin name (e.g. "sharing", "tbfed-pagefooter",
    // "fontsettings"); an empty object means no options are set for that plugin.
    var config = {"disqus":{"shortName":"gitbookuse"},"github":{"url":"https://github.com/dododream"},"search-pro":{"cutWordLib":"nodejieba","defineWord":["gitbook-use"]},"sharing":{"weibo":true,"facebook":true,"twitter":true,"google":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"tbfed-pagefooter":{"copyright":"Copyright © BigCat","modify_label":"「Revision Time:","modify_format":"YYYY-MM-DD HH:mm:ss」"},"baidu":{"token":"ff100361cdce95dd4c8fb96b4009f7bc"},"sitemap":{"hostname":"http://www.treenewbee.top"},"donate":{"wechat":"http://weixin.png","alipay":"http://alipay.png","title":"","button":"赏","alipayText":"支付宝打赏","wechatText":"微信打赏"},"edit-link":{"base":"https://github.com/dododream/edit","label":"Edit This Page"},"splitter":{},"toggle-chapters":{},"highlight":{},"fontsettings":{"theme":"white","family":"sans","size":2},"livereload":{}};
    // Pass the configuration to the gitbook module's start() entry point.
    gitbook.start(config);
});
</script>

        <!-- body:end -->
    </body>
    <!-- End of book Python爬虫课程讲义 -->
</html>
